TIPS
Write commands in code editor of R Studio and run them using icon
Run in R Studio.
apply function vs. for loop
We’ll compute the mean expression of all genes two ways. The apply function is
much faster than using a for loop. We will be using the system.time() command,
which tells us how long it took to run a command.
The apply function returns a named vector, so here is a
quick reminder on how named vectors work.
# Create a named vector in a single step: each value is paired with its label (name)
test_vec <- c(Control = 53, TreatA = 15, TreatB = 78)
# the values can then be accessed by name
test_vec["TreatA"]
## TreatA
## 15
apply vs. for loop
We will be computing the mean expression across a number of genes in a given file.
# Load a TCGA SKCM gene expression table (rows = genes, columns = samples).
# NOTE(review): requires network access; row.names=1 uses the first column
# (gene symbols) as row names.
gene_exp <- read.table("http://wd.cri.uic.edu/advanced_R/TCGA.SKCM.20.txt",
header=T, row.names=1)
# 20501 genes x 20 samples
dim(gene_exp)
## [1] 20501 20
# preview the first 6 genes for the first 2 samples
gene_exp[1:6, 1:2]
apply function
system.time(mean_gene_exp <- apply(gene_exp, 1, mean))
## user system elapsed
## 0.107 0.011 0.119
head(mean_gene_exp)
## A1BG A1CF A2BP1 A2LD1 A2ML1 A2M
## 7.66027247 0.00000000 0.04663362 6.39365297 1.54585053 14.59680606
for loop. NOTE: Make sure you type out the whole system.time and loop code
before executing.
N_genes <- nrow(gene_exp)
# Same computation with an explicit loop. The result vector grows on each
# iteration and every row is coerced with as.numeric(), which is why this is
# much slower than apply(). (Kept deliberately naive for the timing comparison.)
mean_gene_exp <- vector()
system.time(for (i in 1:N_genes){
mean_gene_exp[i] <- mean(as.numeric(gene_exp[i, ]))
})
## user system elapsed
## 2.701 0.000 2.707
head(mean_gene_exp)
## [1] 7.66027247 0.00000000 0.04663362 6.39365297 1.54585053 14.59680606
The loop is MUCH slower, but the result is the same. However
note that the for loop did not return a named vector.
rbind function to combine datasets row by row
data1 <- read.table("http://wd.cri.uic.edu/advanced_R/data1.txt", header=T)
data1
data2 <- read.table("http://wd.cri.uic.edu/advanced_R/data2.txt", header=T)
data2
# use `rbind` to combine datasets row by row
# (both data frames must have the same column names)
data1_2 <- rbind(data1, data2)
data1_2
cbind function to combine datasets column by column
data3 <- read.table("http://wd.cri.uic.edu/advanced_R/data3.txt", header=T)
data3
# use `cbind` to combine datasets column by column
# Note that the subject IDs must be in the same order in the two tables:
# unlike merge(), cbind() does no matching by ID.
data1_3 <- cbind(data1, data3[, 2:3])
data1_3
merge function to intersect datasets
data4 <- read.table("http://wd.cri.uic.edu/advanced_R/data4.txt", header=T)
data4
data5 <- read.table("http://wd.cri.uic.edu/advanced_R/data5.txt", header=T)
data5
# use `merge` to obtain intersection of datasets
# The subject IDs don't need to be in the same order.
# (only IDs present in BOTH tables are kept)
data_intersection <- merge(data4, data5, by="ID")
data_intersection
merge function to make union of datasets
# use `merge` to obtain union of datasets with `all=T`
# The subject IDs don't need to be in the same order.
# all=T keeps IDs found in either table; missing columns are filled with NA.
data_union <- merge(data4, data5, by="ID", all=T)
data_union
# list of visits per patient
data6 <- read.table("http://wd.cri.uic.edu/advanced_R/data6.txt", header=T)
data6
# check the number of replicates for each "ID" in data6
table(data6$ID)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# compare that with the number of replicates for each "ID" in data4
table(data4$ID)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# merge data6 and data4 by "ID": a one-to-many merge, so each data4 row is
# repeated for every matching visit in data6
data_visits <- merge(data6, data4, by="ID")
data_visits
Add a new column to an existing data frame. We will add the new column Agecat
(age category) and we will use the cut command to assign the levels. For cut,
the following arguments were provided: the breaks argument defines the bin
boundaries (0, 60, Inf); the labels argument defines the labels for each bin.
data_union$Agecat <- cut(data_union$Age, c(0, 60, Inf),
labels=c("Young", "Old"))
subset function to select a subset of data
# select subset of data (young and female)
# subset() keeps rows where the condition is TRUE (rows where it is NA are dropped)
data_sub <- subset(data_union, Agecat=="Young" & Sex=="F")
data_sub
grep, sub and gsub. First read in the sample IDs.
sampleIDs <- colnames(read.table("http://wd.cri.uic.edu/advanced_R/TCGA.small.txt",
header=T, row.names=1))
sampleIDs
## [1] "TCGA.D3.A3MR.06A.11R.A21D.07" "TCGA.D3.A8GI.06A.11R.A37K.07"
## [3] "TCGA.D9.A3Z1.06A.11R.A239.07" "TCGA.EE.A2GM.06B.11R.A18S.07"
sub or gsub function to substitute strings
# `sub` only substitutes the first match.
# "fixed=T": string to be matched as it is
sub(".", "-", sampleIDs, fixed=T)
## [1] "TCGA-D3.A3MR.06A.11R.A21D.07" "TCGA-D3.A8GI.06A.11R.A37K.07"
## [3] "TCGA-D9.A3Z1.06A.11R.A239.07" "TCGA-EE.A2GM.06B.11R.A18S.07"
# `gsub` substitutes all matches.
gsub(".", "-", sampleIDs, fixed=T)
## [1] "TCGA-D3-A3MR-06A-11R-A21D-07" "TCGA-D3-A8GI-06A-11R-A37K-07"
## [3] "TCGA-D9-A3Z1-06A-11R-A239-07" "TCGA-EE-A2GM-06B-11R-A18S-07"
# remove argument "fixed=T": string to be matched as a regular expression
# "." as a regular expression means any character,
# so EVERY character gets replaced with "-"
gsub(".", "-", sampleIDs)
## [1] "----------------------------" "----------------------------"
## [3] "----------------------------" "----------------------------"
"\\.[0-9]+" means to match "." followed by one or more digits
# First we will show the original values
sampleIDs
## [1] "TCGA.D3.A3MR.06A.11R.A21D.07" "TCGA.D3.A8GI.06A.11R.A37K.07"
## [3] "TCGA.D9.A3Z1.06A.11R.A239.07" "TCGA.EE.A2GM.06B.11R.A18S.07"
# Then we will perform the substitution:
# "\\." is a literal dot and "[0-9]+" is one or more digits
gsub("\\.[0-9]+", "???", sampleIDs)
## [1] "TCGA.D3.A3MR???A???R.A21D???" "TCGA.D3.A8GI???A???R.A37K???"
## [3] "TCGA.D9.A3Z1???A???R.A239???" "TCGA.EE.A2GM???B???R.A18S???"
grep to search for a string
# search for sample IDs containing "06A", `grep` will return the indices
grep("06A", sampleIDs)
## [1] 1 2 3
# obtain values using the indices returned by `grep`
sampleIDs[grep("06A", sampleIDs)]
## [1] "TCGA.D3.A3MR.06A.11R.A21D.07" "TCGA.D3.A8GI.06A.11R.A37K.07"
## [3] "TCGA.D9.A3Z1.06A.11R.A239.07"
# use "value=T" to obtain the matching values directly
grep("06A", sampleIDs, value=T)
## [1] "TCGA.D3.A3MR.06A.11R.A21D.07" "TCGA.D3.A8GI.06A.11R.A37K.07"
## [3] "TCGA.D9.A3Z1.06A.11R.A239.07"
grep to search for a string using a regular expression
# use a regular expression:
# "D[0-9]+" means to match "D" followed by one or more digits
grep("D[0-9]+", sampleIDs, value=T)
## [1] "TCGA.D3.A3MR.06A.11R.A21D.07" "TCGA.D3.A8GI.06A.11R.A37K.07"
## [3] "TCGA.D9.A3Z1.06A.11R.A239.07"
# just print the rows with subjects in their 50s: match 5 plus any digit
# (the numeric Age column is coerced to character before matching)
data4 <- read.table("http://wd.cri.uic.edu/advanced_R/data4.txt", header=T)
data4[grep("5[0-9]",data4$Age),]
# Load the messy table from a tab-delimited text file
patients <- read.table("http://wd.cri.uic.edu/advanced_R/clinical.data.txt",
header=T, sep="\t")
# Or load the messy table from a Microsoft Excel file
# Install with install.packages("openxlsx") if needed (third-party package)
library(openxlsx)
patients <- read.xlsx("https://wd.cri.uic.edu/advanced_R/clinical.data.xlsx",
sheet=1)
# preview the first 15 rows
head(patients, 15)
# fix patient IDs (i.e. rownames of data frame)
# remove "-KP" at the end; "$" anchors the match at the end of the string
patients$PatientID <- sub("-KP$", "", patients$PatientID)
head(patients)
# replace "-" with "." (fixed=T: treat "-" literally, not as a regex)
patients$PatientID <- gsub("-", ".", patients$PatientID, fixed=T)
head(patients)
# Age: Replace missing values with the average age over all patients
# (simple mean imputation).
# preview the first 10 rows
head(patients, 10)
# is.na() will test which elements are missing values.
# na.rm=T: missing values (NA) should be stripped before the computation proceeds.
patients$Age[is.na(patients$Age)] <- mean(patients$Age, na.rm=T)
head(patients, 10)
# check unique names for gender
unique(patients$Gender)
## [1] "Male" "F" "M" "Female"
# the table() command will give us the counts
table(patients$Gender)
##
## F Female M Male
## 27 1 24 3
# change "Female" to "F"; change "Male" to "M"
# NOTE(review): the level indices below depend on the (alphabetical) level
# order "F" "Female" "M" "Male"; renaming a level merges its counts.
patients$Gender <- factor(patients$Gender)
levels(patients$Gender)
## [1] "F" "Female" "M" "Male"
levels(patients$Gender)[2] <- "F"
levels(patients$Gender)
## [1] "F" "M" "Male"
levels(patients$Gender)[3] <- "M"
levels(patients$Gender)
## [1] "F" "M"
unique(patients$Gender)
## [1] M F
## Levels: F M
# preview the first 10 rows
head(patients, 10)
Change "." or "" to NA (missing value).
# change "." or "" to NA (missing value)
# preview the first 15 rows
patients[1:15, ]
unique(patients$Disease)
## [1] "0" "1" NA "."
# values not listed in `levels` (here ".") become NA automatically
patients$Disease <- factor(patients$Disease, levels=c("0", "1"))
unique(patients$Disease)
## [1] 0 1 <NA>
## Levels: 0 1
head(patients, 15)
unique(patients$Genotype)
## [1] "MT" "Mutant" "Wild-type" "WT" "mt" NA
## [7] "Wildtype" "Wild type"
# Use a substitution; ".*" at the end of the pattern matches the rest of the
# text, so every value starting with "w"/"W" becomes "WT".
patients$Genotype <- sub("^w.*", "WT", patients$Genotype, ignore.case=T)
# Change using a grep, i.e. select all of the values that
# match the pattern (starting with "m"/"M") and set their value.
patients$Genotype[grep("^m", patients$Genotype, ignore.case=T)] <- "MT"
# Convert to a factor
patients$Genotype <- factor(patients$Genotype)
unique(patients$Genotype)
## [1] MT WT <NA>
## Levels: MT WT
# preview the first 10 rows
head(patients, 10)
# write the cleaned data into a file.
# remove the index column using row.names=F; output missing values (NA) as empty string
write.table(patients, file="clean.clinical.data.txt", quote=F, sep = "\t",
row.names=F, na="")
NOTE: If you have not previously installed ggplot2 then
install from CRAN.
install.packages("ggplot2")
Once the ggplot2 package is installed make sure to load
for this exercise
# load the package
library(ggplot2)
Read in the data for this exercise. This is a table of cell statistics from a single-cell RNA-seq project. The columns in this table are:
# Load the per-cell statistics table (tab-delimited, cell IDs as row names)
sc <- read.table("http://wd.cri.uic.edu/advanced_R/scRNA_cells.txt",
header=T,row.names=1,sep="\t")
head(sc)
Scatterplots: we will make various versions of a UMAP plot.
# aes() alone sets up the axes; no layer is drawn yet
ggplot(sc, aes(x = UMAP_1, y = UMAP_2))
# geom_point() adds the scatter layer
ggplot(sc, aes(x = UMAP_1, y = UMAP_2)) + geom_point()
# fixed (not data-mapped) size and color
ggplot(sc, aes(x = UMAP_1, y = UMAP_2)) + geom_point(size=0.5, color="red")
# shapes 21-25 have both a fill and an outline color
ggplot(sc, aes(x = UMAP_1, y = UMAP_2)) +
geom_point(shape=24, size=0.5, fill="yellow")
Based on gene expression of a gene of interest:
# color mapped to a continuous variable (Foxp3 expression level)
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Foxp3)) +
geom_point()
Based on cluster:
Note that clusters are numbers, so we use factor() to tell ggplot to treat this as a categorical variable.
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = factor(Cluster))) +
geom_point()
# add a second aesthetic: point shape by genotype
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = factor(Cluster), shape = Genotype)) +
geom_point()
# and a third: point size by UMIs per gene
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = factor(Cluster), shape = Genotype,
size=UMIs/Genes)) + geom_point()
# facet into one panel per genotype instead of using shape
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = factor(Cluster), size=UMIs/Genes)) +
geom_point() + facet_grid( ~ Genotype)
# facet on two variables; label_both shows "variable: value" in the strips
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = factor(Cluster), size=UMIs/Genes)) +
geom_point() + facet_grid(HasTCR ~ Genotype, labeller = label_both)
# add titles and legend labels with labs()
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = factor(Cluster), size=UMIs/Genes)) +
geom_point() + facet_grid(HasTCR ~ Genotype, labeller = label_both) +
labs(title="UMAP plot", caption="My scRNA-seq project",
x="UMAP dim 1", y="UMAP dim 2",
color="Cluster", size="UMIs per gene")
# ggplot components can be stored in variables and reused
basePlot <- ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = factor(Cluster), size=UMIs/Genes))
basePlotTitle <- labs(title="UMAP plot", caption="My scRNA-seq project",
x="UMAP dim 1", y="UMAP dim 2",
color="Cluster", size="UMIs per gene")
basePlot + geom_point() + facet_grid(HasTCR ~ Genotype) + basePlotTitle
If you have closed RStudio, make sure to reload the
ggplot2 package.
# load the package
library(ggplot2)
We will do some comparisons of the Foxp3 levels across different clusters.
# aes() alone: axes only, no layer yet
ggplot(sc, aes(x=factor(Cluster), y=Foxp3))
ggplot(sc, aes(x=factor(Cluster), y=Foxp3)) + geom_boxplot()
# fill each box by cluster
ggplot(sc, aes(x=factor(Cluster), y=Foxp3)) + geom_boxplot(aes(fill=factor(Cluster)))
# style the outlier points
ggplot(sc, aes(x=factor(Cluster), y=Foxp3)) +
geom_boxplot(aes(fill=factor(Cluster)),
outlier.fill="red", outlier.color="red",
outlier.shape=8, outlier.size=2)
# store reusable plot components
boxParam <- geom_boxplot(aes(fill=factor(Cluster)), outlier.fill="red",
outlier.color="red", outlier.shape=8, outlier.size=2)
boxTitle <- labs(title="Foxp3 expression vs cluster",
y="Foxp3", x="Cluster",
caption="My scRNA-seq project", fill="Cluster")
boxTheme <- theme(plot.title = element_text(hjust=0.5),
legend.background = element_rect(fill="darkgray"))
ggplot(sc, aes(x=factor(Cluster), y=Foxp3)) + boxParam +
boxTitle + boxTheme
# add the complete theme BEFORE the individual tweaks so it does not overwrite them
boxTheme <- theme_classic() + boxTheme
ggplot(sc, aes(x=factor(Cluster), y=Foxp3)) + boxParam +
boxTitle + boxTheme
ggplot(sc, aes(x=factor(Cluster), y=Foxp3)) + boxParam +
boxTitle + boxTheme + facet_grid( ~ Genotype)
# facet as rows instead of columns
ggplot(sc, aes(x=factor(Cluster), y=Foxp3)) + boxParam +
boxTitle + boxTheme + facet_grid(rows=vars(Genotype))
# violin plot variant
ggplot(sc, aes(x=factor(Cluster), y=Foxp3)) + geom_violin(aes(fill=factor(Cluster))) +
boxTitle + boxTheme + facet_grid(rows=vars(Genotype))
First we need to make a “long” format of gene expression levels vs cluster.
# make a new data frame with just the clusters, and the gene expression levels
# NOTE(review): columns selected by position (7 and 10:17) — verify these
# indices against the input file's column order.
gene_expr <- sc[,c(7,10:17)]
head(gene_expr)
# convert to the "long" format. Load the tidyr library if you haven't already.
library(tidyr)
# This will convert all the columns to long format, except for `Cluster`
# which will be retained in the "long" data.frame.
# The column names will be stored in the new column `name` and
# the values in the column `value`. These are the defaults for pivot_longer
gene_expr_long <- pivot_longer(gene_expr, !Cluster)
head(gene_expr_long)
Plot all of the genes at once.
Note: x aesthetic is NA. We’re making one plot per panel, but then a grid of panels based on cluster and gene (“name” column).
# x=NA gives a single violin per panel; the grid comes from facet_grid
ggplot(gene_expr_long, aes(x=NA,y=value,fill=factor(Cluster))) +
geom_violin() + facet_grid(Cluster ~ name)
Make it prettier:
# strip all decoration, flip the axes, hide the legend, angle the gene labels
ggplot(gene_expr_long, aes(x=NA,y=value,fill=factor(Cluster))) +
geom_violin() + facet_grid(Cluster ~ name) +
theme_void() + coord_flip() +
theme(legend.position = "none") +
theme(strip.text.x = element_text(angle = 45))
If you have closed RStudio, make sure to reload the
ggplot2 package.
# load the package
library(ggplot2)
NOTE: geom_bar() will make a barplot based on the number of cases in each
group, so it will count all of the cells per cluster for us. geom_col() will
make a barplot just based on values provided in a data frame.
We’ll look at the number of cells in each cluster.
Omit the y aesthetic, as geom_bar() will count the number of entries for each value in x.
# geom_bar() counts the rows for each x value, so no y aesthetic is needed
ggplot(sc, aes(x=factor(Cluster))) + geom_bar()
Alternatively, we could count the cells first and plot the counts:
cluster_counts <- table(sc$Cluster)
cluster_counts
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 949 891 809 718 687 566 526 486 310 303 212 179 106 97
# data.frame() on a table gives columns Var1 (the value) and Freq (the count)
cluster_counts_df <- data.frame(cluster_counts)
cluster_counts_df
# geom_col() plots the provided values directly
ggplot(cluster_counts_df, aes(x=Var1, y=Freq)) + geom_col()
# stacked bars: fill by genotype
ggplot(sc, aes(x=factor(Cluster), fill=Genotype)) + geom_bar()
Or if we want the bars side-by-side:
ggplot(sc, aes(x=factor(Cluster), fill=Genotype)) + geom_bar(position="dodge")
# reusable titles
barTitle <- labs(title="Number of cells per cluster", x="Cluster",
y="Number of cells", caption="My scRNA-seq project")
ggplot(sc, aes(x=factor(Cluster), fill=Genotype)) + geom_bar() + barTitle
# facet by genotype and batch
ggplot(sc, aes(x=factor(Cluster))) + geom_bar() + barTitle +
facet_grid(Genotype ~ Batch)
ggplot(sc,aes(x=factor(Cluster),fill=HasTCR)) + geom_bar() + barTitle +
facet_grid(Genotype ~ Batch)
# override the default fill colors
ggplot(sc,aes(x=factor(Cluster),fill=HasTCR)) + geom_bar() + barTitle +
facet_grid(Genotype ~ Batch) +
scale_fill_manual(values=c("red", "darkgreen"))
If you have closed RStudio, make sure to reload the
ggplot2 package.
# load the package
library(ggplot2)
For this exercise we will recreate the following scatter plot from the mtcars data set using the built-in plot functions in R. The plot will have the following features:
- wt (x axis) vs. mpg (y axis)
- point color based on cyl as a factor
- point shape based on am as a factor
- point size based on hp / wt
- facets by am and cyl as factors
For reference, the ggplot2 version:
basePlot <- ggplot(mtcars, aes(wt, mpg, color=factor(cyl), shape=factor(am),
size=(hp/wt)))
# titles and legend labels for the ggplot2 reference version
basePlotTitle <- labs(title="MPG Vs Number of cylinders", y="Miles per Gallon",
x="Weight (1000 lbs)", caption="Source: R MTCARS Dataset",
shape="Auto vs Man", color="Num of cyl", size="gross hp/wt")
basePlot + geom_point() + facet_grid(am ~ cyl, labeller = label_both) + basePlotTitle
Point colors based on mtcars cyl:
point_colors <- factor(mtcars$cyl)
# keep the original level names (4/6/8 cylinders) for the legend text
colors_legend <- levels(point_colors)
# replace the factor levels with colors; as.character(point_colors)
# will then yield one color string per point
levels(point_colors) <- rainbow(length(levels(point_colors)))
Point shapes based on mtcars am:
point_shapes <- factor(mtcars$am)
# keep the original level names for the legend text
shapes_legend <- levels(point_shapes)
# map each level to a plotting symbol (pch) code
levels(point_shapes) <- c(16:18,0:17)[1:length(levels(point_shapes))]
shapes_legend_levels <- as.numeric(levels(point_shapes))
# convert the factor of pch codes back to a plain numeric vector (one per point)
point_shapes <- as.numeric(as.character(point_shapes))
Point sizes based on mtcars hp / wt:
point_sizes <- mtcars$hp / mtcars$wt
size_range <- range(point_sizes) # Will return minimum and maximum value of vector
size_conv <- (size_range[2] - size_range[1] ) / 2 # Will range sizes over 1-3 (2 units)
# rescale the raw hp/wt values onto the 1-3 cex scale
point_sizes <- ( ( point_sizes - size_range[1] ) / size_conv ) + 1
# Set the legend breaks approximately every 0.5 point change, rounded to 10s
size_legend <- seq(floor(size_range[1] / 10) * 10, ceiling(size_range[2] / 10) * 10,
by=ceiling(size_conv / 20 ) * 10)
# convert the legend break values to the same 1-3 cex scale as the points
size_legend_cex <- ( ( size_legend - size_range[1]) / size_conv ) + 1
Create the plot with wt as x-axis and mpg as y-axis. type="p" indicates
plotting points; colors are set with the col parameter, shapes with the pch
parameter and sizes with the cex parameter; the title (main), caption (sub)
and axis labels (xlab and ylab) are also set.
# Set the margins to allow room for the legend on the right hand side
# xpd=TRUE lets the legends be drawn outside the plot region
par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
# Create the basic plot
plot(mtcars$wt, mtcars$mpg, type="p",
col=as.character(point_colors), pch=point_shapes, cex=point_sizes,
main="MPG Vs Number of cylinders",
xlab="Weight (1000 lbs)",
ylab="Miles per Gallon",
sub="Source: R MTCARS Dataset")
# Set the spacing for the legends. There are three, so space each one a third
# of the way up the y axis
legend_spacing <- ( max(mtcars$mpg) - min(mtcars$mpg) ) / 3
# Add the legend for colors (placed just right of the largest x value)
legend(max(mtcars$wt) + 0.25, ( 3 * legend_spacing ) + min(mtcars$mpg),
legend = colors_legend, col=levels(point_colors),
pch=16, title="Num of cyl")
# Add the legend for shapes
legend(max(mtcars$wt) + 0.25, (2 * legend_spacing ) + min(mtcars$mpg),
legend = shapes_legend,
pch=shapes_legend_levels,
title="Auto vs. Man")
# Add the legend for sizes
legend(max(mtcars$wt) + 0.25, legend_spacing + min(mtcars$mpg),
legend = size_legend,
pch=16, pt.cex=size_legend_cex,
title="gross hp/wt")
The layout function allows one to create separate plot
areas. We will use this function to create a plot area for each sub-plot
(facet).
# Get the number of levels for each factor
am_levels <- unique(sort(mtcars$am))
cyl_levels <- unique(sort(mtcars$cyl))
# Create the layout matrix: one cell (plot number) per am x cyl combination
layout_mat <- matrix(seq(1, length(am_levels) * length(cyl_levels)),
nrow=length(am_levels),
ncol=length(cyl_levels), byrow = T)
# Look at the layout matrix
layout_mat
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
# Add a column for the legends (plot number 7 spans both rows)
layout_mat <- cbind(layout_mat,
rep((length(cyl_levels) * length(am_levels) + 1), nrow(layout_mat)))
# Look at the final layout matrix
layout_mat
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 7
## [2,] 4 5 6 7
# Set outer margins for the title, subtitle (caption) and axis labels
par(oma = c(4, 2, 3, 0))
# Setup the layout using the layout matrix
# Set the widths so that each plot is 1 share and the legend is a half (0.5 share)
layout(mat=layout_mat, widths=c(rep(1, length(cyl_levels)), 0.5))
# Create a data frame of the plot styles. Will use this when subsetting for each subplot.
# Its rows align 1:1 with the rows of mtcars, so the same logical mask subsets both.
plot_styles <- data.frame(col=as.character(point_colors),
pch=point_shapes,
cex=point_sizes, stringsAsFactors = F)
# Loop over all values of am and cyl to create each plot.
for ( am_value in am_levels ) {
for ( cyl_value in cyl_levels ) {
# Set the margins of the subplot
par(mar = c(2,2,2,2))
# Create an empty plot with the same x and y scale.
plot(mtcars$wt, mtcars$mpg, type="n",
main=sprintf("am=%d, cyl=%d", am_value, cyl_value))
# Subset the data (mtcars) and plot styles.
mtcars_subset <- mtcars[mtcars$am == am_value & mtcars$cyl == cyl_value, ]
plot_styles_subset <- plot_styles[mtcars$am == am_value & mtcars$cyl == cyl_value, ]
# Add the data for this subplot
points(mtcars_subset$wt, mtcars_subset$mpg,
col=plot_styles_subset$col,
pch=plot_styles_subset$pch,
cex=plot_styles_subset$cex)
}
}
# Create a new plot (area 7 in the layout) for the legends
par(mar = c(10,1,10,1))
plot(1, type="n", axes=F, xlab="", ylab="")
# Put the colors legend at the top of the area
legend(x="top",
legend = colors_legend, col=levels(point_colors),
pch=16, title="Num of cyl")
# Put the shapes legend in the center of the area
legend(x="center",
legend = shapes_legend,
pch=shapes_legend_levels,
title="Auto vs. Man")
# Put the size legend at the bottom of the area
legend(x="bottom",
legend = size_legend,
pch=16, pt.cex=size_legend_cex,
title="gross hp/wt")
# Add the title and subtitle (caption)
mtext("MPG Vs Number of cylinders", outer = TRUE, cex = 1.5)
mtext("Source: R MTCARS Dataset", outer = TRUE, side=1, cex=0.75, line=3)
# Add the axis labels
mtext("Weight (1000 lbs)", outer=TRUE, cex=1, line=1, side=1)
mtext("Miles per Gallon", outer=TRUE, cex=1, side=2)
NOTE: If you have not previously installed doBy then
install from CRAN.
install.packages("doBy")
Once the doBy package is installed make sure to load for
this exercise
# load the package (doBy provides summaryBy for groupwise statistics)
library(doBy)
# birth weight dataset: one row per baby/mother
bw_data <- read.table("http://wd.cri.uic.edu/advanced_R/birth_weight.txt",
header=T)
head(bw_data)
Use apply to summarize descriptive statistics
# obtain mean for all variables using `apply` function
# the 2nd argument "2" means column wise.
# NOTE(review): apply() coerces the data frame to a matrix; fine here
# because all columns are numeric.
apply(bw_data, 2, mean)
## bwt gestation parity age height weight
## 119.4625213 279.1013629 0.2623509 27.2282794 64.0494037 128.4787053
## smoke
## 0.3909710
# obtain median for all variables using `apply` function
apply(bw_data, 2, median)
## bwt gestation parity age height weight smoke
## 120 280 0 26 64 125 0
# obtain standard deviation for all variables using `apply` function
apply(bw_data, 2, sd)
## bwt gestation parity age height weight smoke
## 18.3286714 16.0103051 0.4400999 5.8178387 2.5261015 20.7342822 0.4881759
Use summaryBy to summarize groupwise statistics
# use "summaryBy" to obtain groupwise statistics of bwt for each smoke group
summaryBy(bwt~smoke, data=bw_data, FUN=c(mean, sd, min, median, max))
## smoke bwt.mean bwt.sd bwt.min bwt.median bwt.max
## 1 0 123.0853 17.42370 55 123 176
## 2 1 113.8192 18.29501 58 115 163
summaryBy(bwt~parity, data=bw_data, FUN=c(mean, sd, min, median, max))
## parity bwt.mean bwt.sd bwt.min bwt.median bwt.max
## 1 0 119.9423 18.66204 55 120 174
## 2 1 118.1136 17.31515 63 118 176
Use table to create a contingency table
# build a contingency table of the counts at each combination of factor levels.
# parity: 0 - child first born, 1 - otherwise
# smoke: 0 - mother is not a smoker, 1 - smoker
# dnn= names the dimensions of the table
table(bw_data$parity, bw_data$smoke, dnn=c("parity", "smoke"))
## smoke
## parity 0 1
## 0 525 341
## 1 190 118
If you have closed RStudio, make sure to reload the
ggplot2 package.
# load the package
library(ggplot2)
bw_data <- read.table("http://wd.cri.uic.edu/advanced_R/birth_weight.txt", header=T)
Use ggplot to create a histogram of baby's birth weight.
# check if it is a bell shape in the histogram of baby's birth weight
# histogram of birth weight in 10-ounce bins
ggplot(bw_data, aes(x=bwt)) +
geom_histogram(binwidth=10, fill="red", color="black") +
labs(title="Histogram of baby's birth weight")
Use shapiro.test to check normality of baby's birth weight.
Caution: a large sample size will usually result in a significant p value in the Shapiro test, because any tiny deviation from normality will be detected. We should also look at how far W is from 1 to see how big the deviation is. For reasonably small differences, the normality assumption is probably still OK.
# there are 1174 data points
length(bw_data$bwt)
## [1] 1174
# Shapiro-Wilk test: H0 = data are normally distributed
shapiro.test(bw_data$bwt)
##
## Shapiro-Wilk normality test
##
## data: bw_data$bwt
## W = 0.99563, p-value = 0.001917
The p-value is <0.05, but W is quite close to 1, so we’re probably OK. Look at the Q-Q plot also.
# use QQ-normal plot to check the normality.
# if the data is normally distributed, the points in the QQ-normal plot will
# lie on a straight diagonal line.
qqnorm(bw_data$bwt)
# qqline shows a line for a "theoretical" normal distribution
qqline(bw_data$bwt, col="red")
Use shapiro.test to check normality of a raw count of a gene from RNA-seq.
# get raw read count data
raw.count <- read.table("http://wd.cri.uic.edu/advanced_R/raw.count.txt",
header=T, row.names=1)
# convert the first gene's raw read counts to a numeric vector
count <- as.numeric(raw.count[1, ])
# there are 46 data points
length(count)
## [1] 46
# check if it is a bell shape in the histogram of raw read counts
hist(count, main="Histogram of raw read counts", col="red")
# W far below 1 and a tiny p-value: clearly not normal
shapiro.test(count)
##
## Shapiro-Wilk normality test
##
## data: count
## W = 0.49027, p-value = 2.014e-11
qqnorm(count)
# qqline adds a line for a "theoretical" normal distribution
qqline(count, col="red")
# create a new column for mother's age category
bw_data$agecat <- cut(bw_data$age,
breaks=c(0,23,30,Inf), labels=c("Young", "MidAged", "Elder"))
head(bw_data)
# create a boxplot: baby's birth weight ~ mother's three age categories.
boxTitle <- labs(title="Baby's birth weight vs. mother's age category",
x="Mother's age category", y="Baby's birth weight (ounce)", fill="Age category")
ggplot(bw_data, aes(x=agecat, y=bwt)) + geom_boxplot(aes(fill=agecat)) + boxTitle
# One way ANOVA to test the baby's birth weight among mother's three age categories.
summary(aov(bwt ~ agecat, data=bw_data))
## Df Sum Sq Mean Sq F value Pr(>F)
## agecat 2 728 364.1 1.084 0.339
## Residuals 1171 393330 335.9
# Redirect this to a variable so we can save the statistics.
anova.stats <- summary(aov(bwt ~ agecat, data=bw_data))
# extract the p-value column; the trailing NA is the Residuals row
anova.pvalues <- anova.stats[[1]][["Pr(>F)"]]
anova.pvalues
## [1] 0.3386306 NA
# label smoking status: 0 - non-smoker; 1 - smoker
bw_data$smoke <- factor(bw_data$smoke, labels=c("Non-smoker", "Smoker"))
# create a boxplot using facet grid of smoke (0 or 1) and age category
ggplot(bw_data, aes(x=agecat, y=bwt, group=agecat)) +
geom_boxplot(aes(fill=agecat)) + facet_grid( ~ smoke)
# Two way ANOVA: agecat and smoke
summary(aov(bwt ~ agecat + smoke, data=bw_data))
## Df Sum Sq Mean Sq F value Pr(>F)
## agecat 2 728 364 1.153 0.316
## smoke 1 23894 23894 75.671 <2e-16 ***
## Residuals 1170 369436 316
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Two way ANOVA with an interaction term (agecat:smoke)
summary(aov(bwt ~ agecat * smoke, data=bw_data))
## Df Sum Sq Mean Sq F value Pr(>F)
## agecat 2 728 364 1.153 0.316
## smoke 1 23894 23894 75.674 <2e-16 ***
## agecat:smoke 2 642 321 1.017 0.362
## Residuals 1168 368794 316
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# save as in a variable
twoway.stats <- summary(aov(bwt ~ agecat * smoke, data=bw_data))
# see p-values (order: agecat, smoke, interaction, Residuals row is NA)
twoway.stats[[1]][["Pr(>F)"]]
## [1] 3.160488e-01 1.118213e-17 3.618961e-01 NA
If you have closed RStudio, make sure to execute the following.
# load the package
library(ggplot2)
bw_data <- read.table("http://wd.cri.uic.edu/advanced_R/birth_weight.txt", header=T)
# create a new column for mother's age category
# (same result as cut(); assignment by logical subsetting instead)
bw_data$agecat[bw_data$age > 30] <- "Elder"
bw_data$agecat[bw_data$age > 23 & bw_data$age <= 30] <- "MidAged"
bw_data$agecat[bw_data$age <= 23] <- "Young"
# change agecat to a factor
bw_data$agecat <- factor(bw_data$agecat, levels=c("Young", "MidAged", "Elder"))
# create a boxplot for baby's birthweight vs. mother's smoking status
boxTitle <- labs(title="Baby's birth weight vs. mother's smoking status",
y="Baby's birth weight (ounce)", x="Mother's smoking status", fill="Smoking status")
ggplot(bw_data, aes(x=factor(smoke), y=bwt) ) +
geom_boxplot( aes(fill = factor(smoke))) + boxTitle
# Use non-parametric test to test baby's birthweight vs. mother's smoking status
wilcox.test(bwt ~ smoke, data=bw_data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: bwt by smoke
## W = 212970, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# save as a variable
wilcox.stats <- wilcox.test(bwt ~ smoke, data=bw_data)
# p-value
wilcox.stats$p.value
## [1] 6.485236e-18
head(bw_data)
# Use non-parametric test (Kruskal-Wallis, for more than two groups) to test
# baby's birthweight vs. mother's age category
kruskal.test(bwt ~ agecat, data=bw_data)
##
## Kruskal-Wallis rank sum test
##
## data: bwt by agecat
## Kruskal-Wallis chi-squared = 2.6045, df = 2, p-value = 0.2719
# create a scatter plot to see relationship between mother's height and weight
ggplot(bw_data, aes(x=height, y=weight)) +
geom_point(size=2, color="red") +
geom_smooth(formula=y~x,method = "lm", se = T) +
labs(title="Correlation between mother's height and weight",
x="height", y="weight")
geom_smooth() adds a smoothed conditional mean to the plot
# Pearson correlation (linear association; default method of cor.test)
cor.test(bw_data$height, bw_data$weight)
##
## Pearson's product-moment correlation
##
## data: bw_data$height and bw_data$weight
## t = 16.552, df = 1172, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3877305 0.4805332
## sample estimates:
## cor
## 0.4352874
# save as a variable
pearson.stats <- cor.test(bw_data$height, bw_data$weight)
# p-value
pearson.stats$p.value
## [1] 1.837015e-55
# correlation coefficient
pearson.stats$estimate
## cor
## 0.4352874
# Spearman correlation (rank-based, robust to outliers)
cor.test(bw_data$height, bw_data$weight, method="spearman")
##
## Spearman's rank correlation rho
##
## data: bw_data$height and bw_data$weight
## S = 134713533, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5004735
# Kendall correlation (rank-based, based on concordant/discordant pairs)
cor.test(bw_data$height, bw_data$weight, method="kendall")
##
## Kendall's rank correlation tau
##
## data: bw_data$height and bw_data$weight
## z = 18.02, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.3743995
# simple linear model: birth weight predicted by gestation length
model <- lm(bwt ~ gestation, data=bw_data) # fit the regression model
# display model results
summary(model)
##
## Call:
## lm(formula = bwt ~ gestation, data = bw_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.348 -11.065 0.218 10.101 57.704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.75414 8.53693 -1.26 0.208
## gestation 0.46656 0.03054 15.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.74 on 1172 degrees of freedom
## Multiple R-squared: 0.1661, Adjusted R-squared: 0.1654
## F-statistic: 233.4 on 1 and 1172 DF, p-value: < 2.2e-16
# create a scatter plot for baby's bwt vs gestation
ggplot(bw_data, aes(x=gestation, y=bwt)) +
geom_point(size=2, color="red") +
geom_smooth(formula=y~x,method = "lm", se = F) +
labs(title="Baby's birth weight ~ gestation", x="gestation (days)",
y="Baby's birth weight (ounce)")
# There will be four plots from plot(model). The 2nd one is Q-Q plot.
# The Q-Q plot shows that residuals are normally distributed,
# which means that the data meets normality assumptions of linear regression.
plot(model, 2)
# general linear model for multiple factors
# (smoke wrapped in factor() so it is treated as categorical, not numeric)
model <- lm(bwt ~ gestation + weight + factor(smoke), data=bw_data)
# display model results
summary(model)
##
## Call:
## lm(formula = bwt ~ gestation + weight + factor(smoke), data = bw_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.920 -10.759 -0.279 9.743 51.354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.62648 8.69128 -2.028 0.0428 *
## gestation 0.44809 0.02936 15.261 < 2e-16 ***
## weight 0.11818 0.02267 5.213 2.2e-07 ***
## factor(smoke)1 -8.07789 0.96444 -8.376 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.07 on 1170 degrees of freedom
## Multiple R-squared: 0.2335, Adjusted R-squared: 0.2315
## F-statistic: 118.8 on 3 and 1170 DF, p-value: < 2.2e-16
# create scatter plot for baby's bwt vs gestation, facet by smoke
gg1 <- ggplot(bw_data, aes(x=gestation, y=bwt)) +
geom_point(size=2, color="red") +
geom_smooth(formula=y~x,method = "lm", se = F) +
labs(title="Baby's birth weight ~ gestation", x="gestation (days)",
y="Baby's birth weight (ounce)")
gg1 + facet_grid(~smoke)
# create scatter plot for baby's bwt vs weight, facet by smoke
gg2 <- ggplot(bw_data, aes(x=weight, y=bwt)) +
geom_point(size=2, color="red") +
geom_smooth(formula=y~x,method = "lm", se = F) +
labs(title="Baby's birth weight ~ mother's weight", x="Mother's weight (pounds)",
y="Birth weight (ounce)")
gg2 + facet_grid(~smoke)
Is there a significant association between women’s age at first birth and breast cancer status?
| | <20 | 20-24 | 25-29 | 30-34 | >34 |
|---|---|---|---|---|---|
| Cancer | 320 | 1206 | 1011 | 463 | 220 |
| No cancer | 1422 | 4432 | 2893 | 1092 | 406 |
# read the data into a data frame; spell out TRUE/FALSE rather than T/F.
# check.names=FALSE keeps column names like "<20" from being mangled
# into syntactic names (e.g. "X.20").
age_cancer <- read.table("http://wd.cri.uic.edu/advanced_R/age_cancer.txt",
header=TRUE, sep="\t", row.names=1, check.names=FALSE)
age_cancer
# chi-squared test of independence on the 2 x 5 contingency table
chisq.test(age_cancer)
##
## Pearson's Chi-squared test
##
## data: age_cancer
## X-squared = 130.34, df = 4, p-value < 2.2e-16
Make a bar plot of the fraction of cancer vs non-cancer patients in each age category.
# convert the counts in each row to fractions (each row sums to 1)
age_cancer_percent <- age_cancer / rowSums(age_cancer)
# keep the row labels (Cancer / No cancer) as an explicit column
age_cancer_percent$Group <- rownames(age_cancer_percent)
# reshape from wide to long format for ggplot
library(tidyr)
# !Group selects every column except Group for pivoting
age_cancer_long <- pivot_longer(age_cancer_percent, !Group,
names_to = "Age", values_to = "Fraction")
# Order the Age factor levels to match the column order of the original
# file, so the bars appear in age order rather than alphabetically.
age_cancer_long$Age <- ordered(age_cancer_long$Age,
levels=colnames(age_cancer))
# grouped bar plot: one pair of bars (Cancer / No cancer) per age group
ggplot(age_cancer_long, aes(x=Age, y=Fraction, fill=Group)) +
geom_col(position='dodge')
A bit more programming: write a loop to run a post-hoc with Fisher’s Exact test for each age group. Note: you do not need to type the comments in the loop, but they are there to explain what each command is doing.
# store number of columns
cols <- ncol(age_cancer)
# start an empty data frame
age_stats <- data.frame()
# run a test for each column (age group) one at a time.
# seq_len(cols) is safer than 1:cols, which yields c(1, 0) when cols is 0.
for(i in seq_len(cols)){
# get the counts for this age group
age.counts <- age_cancer[,i]
# get the counts for all other age groups
age.other <- rowSums(age_cancer[,seq_len(cols) != i])
# we're specifically seeing if THIS age group has a different distribution
# of cancer vs non-cancer patients, using fisher's exact test
age.table <- cbind(age.counts, age.other)
fet <- fisher.test(age.table)
# combine the estimate (odds ratio) and p-value into the data frame
# and use the age group as the row name
age_stats <- rbind(age_stats, c(fet$estimate, fet$p.value) )
rownames(age_stats)[nrow(age_stats)] <- colnames(age_cancer)[i]
}
# set the column names, and add log2-scaled odds ratio and FDR corrected p-value.
# Use <- (not =) for assignment, per R style convention.
colnames(age_stats) <- c("OddsRatio","P.Value")
age_stats$Log2OddsRatio <- log2(age_stats$OddsRatio)
age_stats$Q.Value <- p.adjust(age_stats$P.Value, method="fdr")
age_stats
The young age groups have odds ratios <1: lower fraction in cancer groups, which we can also see in the bar plot. The biggest deviation, based on the log2 odds ratio, is for the oldest group. All of the groups have statistically significant differences.
Exercise: do the same thing with an apply statement. We use sapply to loop over all elements of a vector, in this case the names of the columns.
# sapply applies the anonymous function to each column name in turn
age_stats2 <- sapply(colnames(age_cancer),function(x){
# counts for this age group vs the summed counts of all other groups
age.counts <- age_cancer[[x]]
age.other <- rowSums(age_cancer[,colnames(age_cancer) != x])
age.table <- cbind(age.counts, age.other)
fet <- fisher.test(age.table)
return(c(fet$estimate, fet$p.value))
})
# the result will have the groups in the columns
# transpose and save as a data frame
age_stats2 <- as.data.frame(t(age_stats2))
# add in the other columns as before; use <- (not =) for assignment
colnames(age_stats2) <- c("OddsRatio","P.Value")
age_stats2$Log2OddsRatio <- log2(age_stats2$OddsRatio)
age_stats2$Q.Value <- p.adjust(age_stats2$P.Value, method="fdr")
age_stats2
NOTE: If you have not previously installed openxlsx then
install from CRAN.
install.packages("openxlsx")
Load the openxlsx package:
library(openxlsx)
# read a worksheet by its numeric index (the first sheet)
sheet_1 <- read.xlsx("http://wd.cri.uic.edu/advanced_R/taxa_relative.xlsx", sheet = 1)
sheet_1[1:10, 1:5]
# read a worksheet by its name instead of its index
sheet_L6 <- read.xlsx("http://wd.cri.uic.edu/advanced_R/taxa_relative.xlsx", sheet = "L6")
sheet_L6[1:10, 1:5]
# Load the "messy" data table from earlier
patients <- read.xlsx("http://wd.cri.uic.edu/advanced_R/clinical.data.xlsx", sheet = 1)
# Create a workbook with two empty sheets
wb <- createWorkbook()
addWorksheet(wb, "original")
addWorksheet(wb, "clean")
# Write the "messy" data table to the sheet named original
writeData(wb, sheet="original", x=patients)
# Load the "clean" data we saved earlier
clean_clinical_data <- read.delim("clean.clinical.data.txt")
writeData(wb, sheet="clean", x=clean_clinical_data)
# Save the workbook to an XLSX file.
# overwrite=TRUE replaces the file if it already exists
# (spell out TRUE rather than T, which can be reassigned).
saveWorkbook(wb, "clinical_data.xlsx", overwrite=TRUE)
The following are a set of additional examples that you can try on your own.
The viridis scales provide color maps that are perceptually uniform in both color and black-and-white. They are also designed to be perceived by viewers with common forms of color blindness.
There are three sets of viridis color/fill functions that can be used
based on the type of data. The functions also have an
option parameter that can be used to designate the viridis
palette to use
scale_color_viridis_d/scale_fill_viridis_d - Provides colors for discrete data (factors)
scale_color_viridis_c/scale_fill_viridis_c - Provides a color scale for continuous data
scale_color_viridis_b/scale_fill_viridis_b - Will bin continuous data prior to generating a color scale
For all of these exercises, we will be using the plot examples from
the ggplot2 scatter plots. We will set up the basePlot
variable that has the basic details of the plot and then we will modify this
variable using different color scales.
# read the scRNA-seq cell table; header=TRUE (not T) uses the first row as names
sc <- read.table("http://wd.cri.uic.edu/advanced_R/scRNA_cells.txt",
header=TRUE, row.names=1, sep="\t")
# base plot: UMAP scatter with titles; color scales are layered on below
basePlot <- ggplot(sc, aes(x = UMAP_1, y = UMAP_2)) + geom_point() +
labs(title="UMAP plot", caption="My scRNA-seq project",
x="UMAP dim 1", y="UMAP dim 2")
# discrete viridis scale for cluster labels (default palette)
basePlot + aes(color=factor(Cluster)) + labs(color="Cluster") +
scale_color_viridis_d()
# discrete scale with the "turbo" palette
basePlot + aes(color=factor(Cluster)) + labs(color="Cluster") +
scale_color_viridis_d(option="turbo")
# continuous scale for expression values (default palette)
basePlot + aes(color = Foxp3) + labs(color="Foxp3 expression") +
scale_color_viridis_c()
# continuous scale with the "plasma" palette
basePlot + aes(color = Foxp3) + labs(color="Foxp3 expression") +
scale_color_viridis_c(option="plasma")
# binned scale: continuous values are bucketed before coloring
basePlot + aes(color = Foxp3) + labs(color="Foxp3 expression") +
scale_color_viridis_b(option="plasma")
The brewer scales provide sequential, diverging and qualitative color schemes from ColorBrewer. These are particularly well suited to display discrete values on a map. The following palettes are available. Please note that these palettes have a set number of defined colors, and if the number of items being colored is larger than the size of the palette the additional items will not be colored.
For this exercise, we will recreate the violin plot for all genes from the ggplot2 - Box and violin plot exercise. First we need to make a “long” format of gene expression levels vs cluster. We will modify the original plot slightly and color the plot by gene.
# make a new data frame with just the clusters, and the gene expression levels
# (assumes column 7 is Cluster and columns 10-17 hold the eight genes --
# verify against the columns of sc)
gene_expr <- sc[,c(7,10:17)]
# convert to the "long" format. Load the tidyr library if you haven't already.
library(tidyr)
# !Cluster pivots every column except Cluster into name/value pairs
gene_expr_long <- pivot_longer(gene_expr, !Cluster)
# x=NA draws a single violin per facet cell; fill distinguishes the genes.
# theme_void + coord_flip give compact horizontal violins with no axes.
basePlot <- ggplot(gene_expr_long, aes(x=NA,y=value,fill=name)) +
geom_violin() + facet_grid(Cluster ~ name) +
theme_void() + coord_flip() +
theme(legend.position = "none") +
theme(strip.text.x = element_text(angle = 45))
# the same plot with three different ColorBrewer palettes
basePlot + scale_fill_brewer(palette = "Spectral")
basePlot + scale_fill_brewer(palette = "Set1")
basePlot + scale_fill_brewer(palette = "YlGnBu")
Using dplyr
This exercise is the same as exercise 3.1. However, instead of using
doBy this shows how to perform the same analysis using the
dplyr package. The dplyr package has a number
of tools that can be used to transform data. For more information please
visit their website https://dplyr.tidyverse.org/
install.packages("dply")
Once the dplyr package is installed, make sure to load it
for this exercise.
# load the package; dplyr masks filter/lag (from stats) and several set
# operations (from base), as shown in the startup messages below
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# read the birth weight dataset; header=TRUE (not T) uses the first row as names
bw_data <- read.table("http://wd.cri.uic.edu/advanced_R/birth_weight.txt",
header=TRUE)
head(bw_data)
Use summarize to summarize descriptive statistics.
- The summarize method will generate a summary using the provided function(s).
- The across method will run the function(s) across the values in the specified columns.
- The everything method specifies all columns in the data.
# Compute the mean values for each column
bw_data %>% summarize(across(everything(), mean))
# Compute the median value for each column
bw_data %>% summarize(across(everything(), median))
# Compute the standard deviation (sd) for each column
bw_data %>% summarize(across(everything(), sd))
# Do all three in a single operation.
# In the list the name (text before the equal sign) will be added as a suffix to
# the column name (producing e.g. bwt_mean, bwt_median, bwt_sd)
bw_data %>% summarize(across(everything(), list(mean=mean, median=median, sd=sd)))
Use group_by to summarize group-wise statistics
# mean, sd, min, median, and max of birth weight within each smoking group
bw_data %>% group_by(smoke) %>%
summarize(bwt_mean=mean(bwt),
bwt_sd=sd(bwt),
bwt_min=min(bwt),
bwt_median=median(bwt),
bwt_max=max(bwt))
Group by “parity” but, use the across function to limit
the amount of typing.
# same statistics per parity group; across() avoids retyping bwt for each one
bw_data %>% group_by(parity) %>%
summarize(across(bwt, list(mean=mean, sd=sd, min=min, median=median, max=max)))
Group by “parity” and “smoke”.
# group by both variables: one summary row per parity/smoke combination
bw_data %>% group_by(parity, smoke) %>%
summarize(across(bwt, list(mean=mean, sd=sd, min=min, median=median, max=max)))
## `summarise()` has grouped output by 'parity'. You can override using the
## `.groups` argument.